/**
 * MOAB, a Mesh-Oriented datABase, is a software component for creating,
 * storing and accessing finite element mesh data.
 *
 * Copyright 2004 Sandia Corporation.  Under the terms of Contract
 * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government
 * retains certain rights in this software.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 */

/** TSTT Mesh Interface brick mesh performance test
 *
 * This program tests TSTT mesh interface functions used to create a square structured
 * mesh.  Boilerplate taken from tstt mesh interface test in MOAB and performance test in MOAB
 *
 */

// Different platforms follow different conventions for usage
#if !defined( _NT ) && !defined( _MSC_VER ) && !defined( __MINGW32__ )
#include <sys/resource.h>
#endif
#ifdef SOLARIS
extern "C" int getrusage( int, struct rusage* );
#ifndef RUSAGE_SELF
#include </usr/ucbinclude/sys/rusage.h>
#endif
#endif

#include <cstdlib>
#include <cstdio>
#include <iostream>

//-------------------------------------------------------
// CHANGE THESE DEFINITIONS TO SUIT YOUR IMPLEMENTATION

// #include here any files with declarations needed to instantiate your
// implementation instance
#include "TSTT_MOAB.hh"

// define IMPLEMENTATION_CLASS to be the namespace::class of your implementation
#define IMPLEMENTATION_CLASS TSTT_MOAB::MoabMesh
//-------------------------------------------------------

#define CAST_INTERFACE( var_in, var_out, iface )                           \
    TSTT::iface var_out;                                                   \
    try                                                                    \
    {                                                                      \
        ( var_out ) = var_in;                                              \
    }                                                                      \
    catch( TSTT::Error )                                                   \
    {                                                                      \
        cerr << "Error: current interface doesn't support iface." << endl; \
        return;                                                            \
    }

#define CAST_MINTERFACE( var_in, var_out, iface )                          \
    TSTTM::iface var_out;                                                  \
    try                                                                    \
    {                                                                      \
        ( var_out ) = var_in;                                              \
    }                                                                      \
    catch( TSTT::Error )                                                   \
    {                                                                      \
        cerr << "Error: current interface doesn't support iface." << endl; \
        return;                                                            \
    }

#include <iostream>
#include "TSTT.hh"
#include "TSTTM.hh"

// needed to get the proper size for handles
typedef void* Entity_Handle;

using namespace std;

#define ARRAY_PTR( array, type )  ( reinterpret_cast< ( type )* >( ( array )._get_ior()->d_firstElement ) )
#define HANDLE_ARRAY_PTR( array ) ( reinterpret_cast< Entity_Handle* >( ( array )._get_ior()->d_firstElement ) )
#define ARRAY_SIZE( array )       ( ( array )._is_nil() ? 0 : ( array ).upper( 0 ) - ( array ).lower( 0 ) + 1 )
#define CHECK_SIZE( array, size )                                     \
    if( ( array )._is_nil() || ARRAY_SIZE( array ) == 0 )             \
        ( array ) = ( array ).create1d( size );                       \
    else if( ARRAY_SIZE( array ) < ( size ) )                         \
    {                                                                 \
        cerr << "Array passed in is non-zero but too short." << endl; \
        assert( false );                                              \
    }

double LENGTH = 1.0;

// forward declare some functions
void query_vert_to_elem( TSTTM::Mesh& mesh );
void query_elem_to_vert( TSTTM::Mesh& mesh );
void print_time( const bool print_em, double& tot_time, double& utime, double& stime );
void build_connect( const int nelem, const int vstart, int*& connect );
void testB( TSTTM::Mesh& mesh, const int nelem, sidl::array< double >& coords, const int* connect );
void testC( TSTTM::Mesh& mesh, const int nelem, sidl::array< double >& coords );
void compute_edge( double* start, const int nelem, const double xint, const int stride );
void compute_face( double* a, const int nelem, const double xint, const int stride1, const int stride2 );
void build_coords( const int nelem, sidl::array< double >& coords );
void build_connect( const int nelem, const int vstart, int*& connect );

int main( int argc, char* argv[] )
{
    int nelem = 20;
    if( argc < 3 )
    {
        std::cout << "Usage: " << argv[0] << " <ints_per_side> <A|B|C>" << std::endl;
        return 1;
    }

    char which_test = '\0';

    sscanf( argv[1], "%d", &nelem );
    sscanf( argv[2], "%c", &which_test );

    if( which_test != 'B' && which_test != 'C' )
    {
        std::cout << "Must indicate B or C for test." << std::endl;
        return 1;
    }

    std::cout << "number of elements: " << nelem << "; test " << which_test << std::endl;

    // pre-build the coords array
    sidl::array< double > coords;
    build_coords( nelem, coords );
    assert( NULL != coords );

    int* connect = NULL;
    build_connect( nelem, 1, connect );

    // create an implementation
    TSTTM::Mesh mesh = IMPLEMENTATION_CLASS::_create();

    switch( which_test )
    {
        case 'B':
            // test B: create mesh using bulk interface
            testB( mesh, nelem, coords, connect );
            break;

        case 'C':
            // test C: create mesh using individual interface
            testC( mesh, nelem, coords );
            break;
    }

    return 0;
}

void testB( TSTTM::Mesh& mesh, const int nelem, sidl::array< double >& coords, const int* connect )
{
    double utime, stime, ttime0, ttime1, ttime2, ttime3;

    print_time( false, ttime0, utime, stime );
    int num_verts = ( nelem + 1 ) * ( nelem + 1 ) * ( nelem + 1 );
    int num_elems = nelem * nelem * nelem;

    // create vertices as a block
    CAST_MINTERFACE( mesh, mesh_arrmod, ArrMod );
    sidl::array< Entity_Handle > sidl_vertices, dum_handles;
    CHECK_SIZE( dum_handles, num_verts );
    int sidl_vertices_size;

    try
    {
        mesh_arrmod.createVtxArr( num_verts, TSTTM::StorageOrder_BLOCKED, coords, 3 * num_verts, sidl_vertices,
                                  sidl_vertices_size );
    }
    catch( TSTT::Error& /* err */ )
    {
        cerr << "Couldn't create vertices in bulk call" << endl;
        return;
    }

    // need to explicitly fill connectivity array, since we don't know
    // the format of entity handles
    sidl::array< Entity_Handle > sidl_connect;
    int sidl_connect_size = 8 * num_elems;
    CHECK_SIZE( sidl_connect, 8 * num_elems );
    Entity_Handle* sidl_connect_ptr = HANDLE_ARRAY_PTR( sidl_connect );

    Entity_Handle* sidl_vertices_ptr = HANDLE_ARRAY_PTR( sidl_vertices );
    for( int i = 0; i < sidl_connect_size; i++ )
    {
        // use connect[i]-1 because we used starting vertex index (vstart) of 1
        assert( connect[i] - 1 < num_verts );
        sidl_connect_ptr[i] = sidl_vertices_ptr[connect[i] - 1];
    }

    // create the entities
    sidl::array< Entity_Handle > new_hexes;
    int new_hexes_size;
    sidl::array< TSTTM::CreationStatus > status;
    int status_size;

    try
    {
        mesh_arrmod.createEntArr( TSTTM::EntityTopology_HEXAHEDRON, sidl_connect, sidl_connect_size, new_hexes,
                                  new_hexes_size, status, status_size );
    }
    catch( TSTT::Error& /* err */ )
    {
        cerr << "Couldn't create hex elements in bulk call" << endl;
        return;
    }

    print_time( false, ttime1, utime, stime );

    // query the mesh 2 ways
    query_elem_to_vert( mesh );

    print_time( false, ttime2, utime, stime );

    query_vert_to_elem( mesh );

    print_time( false, ttime3, utime, stime );

    std::cout << "TSTT/MOAB ucd blocked: nelem, construct, e_to_v query, v_to_e query = " << nelem << ", "
              << ttime1 - ttime0 << ", " << ttime2 - ttime1 << ", " << ttime3 - ttime2 << " seconds" << std::endl;
}

void testC( TSTTM::Mesh& mesh, const int nelem, sidl::array< double >& coords )
{
    double utime, stime, ttime0, ttime1, ttime2, ttime3;
    print_time( false, ttime0, utime, stime );

    CAST_MINTERFACE( mesh, mesh_arrmod, ArrMod );

    // need some dimensions
    int numv      = nelem + 1;
    int numv_sq   = numv * numv;
    int num_verts = numv * numv * numv;
#define VINDEX( i, j, k ) ( ( i ) + ( (j)*numv ) + ( (k)*numv_sq ) )

    // array to hold vertices created individually
    sidl::array< Entity_Handle > sidl_vertices;
    // int sidl_vertices_size = num_verts;
    CHECK_SIZE( sidl_vertices, num_verts );

    // temporary array to hold vertex positions for single vertex
    sidl::array< double > tmp_coords;
    int tmp_coords_size = 3;
    CHECK_SIZE( tmp_coords, 3 );
    double* dum_coords = ARRAY_PTR( tmp_coords, double );

    // get direct pointer to coordinate array
    double* coords_ptr = ARRAY_PTR( coords, double );

    for( int i = 0; i < num_verts; i++ )
    {

        // temporary array to hold (single) vertices
        sidl::array< Entity_Handle > tmp_vertices;
        int tmp_vertices_size = 0;

        // create the vertex
        dum_coords[0] = coords_ptr[i];
        dum_coords[1] = coords_ptr[num_verts + i];
        dum_coords[2] = coords_ptr[2 * num_verts + i];
        try
        {
            mesh_arrmod.createVtxArr( 1, TSTTM::StorageOrder_BLOCKED, tmp_coords, tmp_coords_size, tmp_vertices,
                                      tmp_vertices_size );
        }
        catch( TSTT::Error& /* err */ )
        {
            cerr << "Couldn't create vertex in individual call" << endl;
            return;
        }

        // assign into permanent vertex array
        sidl_vertices.set( i, tmp_vertices.get( 0 ) );
    }

    // get vertex array pointer for reading into tmp_conn
    Entity_Handle* tmp_sidl_vertices = HANDLE_ARRAY_PTR( sidl_vertices );

    for( int i = 0; i < nelem; i++ )
    {
        for( int j = 0; j < nelem; j++ )
        {
            for( int k = 0; k < nelem; k++ )
            {

                sidl::array< Entity_Handle > tmp_conn;
                int tmp_conn_size = 8;
                CHECK_SIZE( tmp_conn, 8 );

                int vijk = VINDEX( i, j, k );
                tmp_conn.set( 0, tmp_sidl_vertices[vijk] );
                tmp_conn.set( 1, tmp_sidl_vertices[vijk + 1] );
                tmp_conn.set( 2, tmp_sidl_vertices[vijk + 1 + numv] );
                tmp_conn.set( 3, tmp_sidl_vertices[vijk + numv] );
                tmp_conn.set( 4, tmp_sidl_vertices[vijk + numv * numv] );
                tmp_conn.set( 5, tmp_sidl_vertices[vijk + 1 + numv * numv] );
                tmp_conn.set( 6, tmp_sidl_vertices[vijk + 1 + numv + numv * numv] );
                tmp_conn.set( 7, tmp_sidl_vertices[vijk + numv + numv * numv] );

                // create the entity

                sidl::array< Entity_Handle > new_hex;
                int new_hex_size = 0;
                sidl::array< TSTTM::CreationStatus > status;
                int status_size = 0;

                try
                {
                    mesh_arrmod.createEntArr( TSTTM::EntityTopology_HEXAHEDRON, tmp_conn, tmp_conn_size, new_hex,
                                              new_hex_size, status, status_size );
                }
                catch( TSTT::Error& /* err */ )
                {
                    cerr << "Couldn't create hex element in individual call" << endl;
                    return;
                }
            }
        }
    }

    print_time( false, ttime1, utime, stime );

    // query the mesh 2 ways
    query_elem_to_vert( mesh );

    print_time( false, ttime2, utime, stime );

    query_vert_to_elem( mesh );

    print_time( false, ttime3, utime, stime );

    std::cout << "TSTT/MOAB ucd indiv: nelem, construct, e_to_v query, v_to_e query = " << nelem << ", "
              << ttime1 - ttime0 << ", " << ttime2 - ttime1 << ", " << ttime3 - ttime2 << " seconds" << std::endl;
}

void query_elem_to_vert( TSTTM::Mesh& mesh )
{
    sidl::array< Entity_Handle > all_hexes;
    int all_hexes_size;
    CAST_MINTERFACE( mesh, mesh_ent, Entity );

    // get all the hex elements
    try
    {
        mesh.getEntities( 0, TSTTM::EntityType_REGION, TSTTM::EntityTopology_HEXAHEDRON, all_hexes, all_hexes_size );
    }
    catch( TSTT::Error& /* err */ )
    {
        cerr << "Couldn't get all hex elements in query_mesh" << endl;
        return;
    }

    try
    {
        // set up some tmp arrays and array ptrs
        Entity_Handle* all_hexes_ptr = HANDLE_ARRAY_PTR( all_hexes );

        // now loop over elements
        for( int i = 0; i < all_hexes_size; i++ )
        {
            sidl::array< int > dum_offsets;
            sidl::array< Entity_Handle > dum_connect;
            int dum_connect_size = 0;
            // get the connectivity of this element; will allocate space on 1st iteration,
            // but will have correct size on subsequent ones
            mesh_ent.getEntAdj( all_hexes_ptr[i], TSTTM::EntityType_VERTEX, dum_connect, dum_connect_size );

            // get vertex coordinates; ; will allocate space on 1st iteration,
            // but will have correct size on subsequent ones
            sidl::array< double > dum_coords;
            int dum_coords_size       = 0;
            TSTTM::StorageOrder order = TSTTM::StorageOrder_UNDETERMINED;
            mesh.getVtxArrCoords( dum_connect, dum_connect_size, order, dum_coords, dum_coords_size );

            assert( 24 == dum_coords_size && ARRAY_SIZE( dum_coords ) == 24 );
            double* dum_coords_ptr = ARRAY_PTR( dum_coords, double );
            double centroid[3]     = { 0.0, 0.0, 0.0 };
            if( order == TSTTM::StorageOrder_BLOCKED )
            {
                for( int j = 0; j < 8; j++ )
                {
                    centroid[0] += dum_coords_ptr[j];
                    centroid[1] += dum_coords_ptr[8 + j];
                    centroid[2] += dum_coords_ptr[16 + j];
                    centroid[0] += dum_coords.get( j );
                    centroid[1] += dum_coords.get( 8 + j );
                    centroid[2] += dum_coords.get( 16 + j );
                }
            }
            else
            {
                for( int j = 0; j < 8; j++ )
                {
                    centroid[0] += dum_coords_ptr[3 * j];
                    centroid[1] += dum_coords_ptr[3 * j + 1];
                    centroid[2] += dum_coords_ptr[3 * j + 2];
                }
            }
        }
    }
    catch( TSTT::Error& /* err */ )
    {
        cerr << "Problem getting connectivity or vertex coords." << endl;
        return;
    }
}

void query_vert_to_elem( TSTTM::Mesh& mesh )
{
    sidl::array< Entity_Handle > all_verts;
    int all_verts_size;
    CAST_MINTERFACE( mesh, mesh_ent, Entity );

    // get all the vertices elements
    try
    {
        mesh.getEntities( 0, TSTTM::EntityType_VERTEX, TSTTM::EntityTopology_POINT, all_verts, all_verts_size );
    }
    catch( TSTT::Error& /* err */ )
    {
        cerr << "Couldn't get all vertices in query_vert_to_elem" << endl;
        return;
    }

    try
    {
        // set up some tmp arrays and array ptrs
        Entity_Handle* all_verts_ptr = HANDLE_ARRAY_PTR( all_verts );

        // now loop over vertices
        for( int i = 0; i < all_verts_size; i++ )
        {
            sidl::array< Entity_Handle > dum_hexes;
            int dum_hexes_size;

            // get the connectivity of this element; will have to allocate space on every
            // iteration, since size can vary
            mesh_ent.getEntAdj( all_verts_ptr[i], TSTTM::EntityType_REGION, dum_hexes, dum_hexes_size );
        }
    }
    catch( TSTT::Error& /* err */ )
    {
        cerr << "Problem getting connectivity or vertex coords." << endl;
        return;
    }
}

void print_time( const bool print_em, double& tot_time, double& utime, double& stime )
{
    struct rusage r_usage;
    getrusage( RUSAGE_SELF, &r_usage );
    utime    = (double)r_usage.ru_utime.tv_sec + ( (double)r_usage.ru_utime.tv_usec / 1.e6 );
    stime    = (double)r_usage.ru_stime.tv_sec + ( (double)r_usage.ru_stime.tv_usec / 1.e6 );
    tot_time = utime + stime;
    if( print_em )
        std::cout << "User, system, total time = " << utime << ", " << stime << ", " << tot_time << std::endl;
#ifndef LINUX
    std::cout << "Max resident set size = " << r_usage.ru_maxrss * 4096 << " bytes" << std::endl;
    std::cout << "Int resident set size = " << r_usage.ru_idrss << std::endl;
#else
    system( "ps o args,drs,rss | grep perf | grep -v grep" );  // RedHat 9.0 doesnt fill in actual
                                                               // memory data
#endif
}

void compute_edge( double* start, const int nelem, const double xint, const int stride )
{
    for( int i = 1; i < nelem; i++ )
    {
        start[i * stride]                     = start[0] + i * xint;
        start[nelem + 1 + i * stride]         = start[nelem + 1] + i * xint;
        start[2 * ( nelem + 1 ) + i * stride] = start[2 * ( nelem + 1 )] + i * xint;
    }
}

void compute_face( double* a, const int nelem, const double xint, const int stride1, const int stride2 )
{
    // 2D TFI on a face starting at a, with strides stride1 in ada and stride2 in tse
    for( int j = 1; j < nelem; j++ )
    {
        double tse = j * xint;
        for( int i = 1; i < nelem; i++ )
        {
            double ada = i * xint;

            a[i * stride1 + j * stride2] =
                ( 1.0 - ada ) * a[i * stride1] + ada * a[i * stride1 + nelem * stride2] +
                ( 1.0 - tse ) * a[j * stride2] + tse * a[j * stride2 + nelem * stride1] -
                ( 1.0 - tse ) * ( 1.0 - ada ) * a[0] - ( 1.0 - tse ) * ada * a[nelem * stride1] -
                tse * ( 1.0 - ada ) * a[nelem * stride2] - tse * ada * a[nelem * ( stride1 + stride2 )];
            a[nelem + 1 + i * stride1 + j * stride2] =
                ( 1.0 - ada ) * a[nelem + 1 + i * stride1] + ada * a[nelem + 1 + i * stride1 + nelem * stride2] +
                ( 1.0 - tse ) * a[nelem + 1 + j * stride2] + tse * a[nelem + 1 + j * stride2 + nelem * stride1] -
                ( 1.0 - tse ) * ( 1.0 - ada ) * a[nelem + 1 + 0] -
                ( 1.0 - tse ) * ada * a[nelem + 1 + nelem * stride1] -
                tse * ( 1.0 - ada ) * a[nelem + 1 + nelem * stride2] -
                tse * ada * a[nelem + 1 + nelem * ( stride1 + stride2 )];
            a[2 * ( nelem + 1 ) + i * stride1 + j * stride2] =
                ( 1.0 - ada ) * a[2 * ( nelem + 1 ) + i * stride1] +
                ada * a[2 * ( nelem + 1 ) + i * stride1 + nelem * stride2] +
                ( 1.0 - tse ) * a[2 * ( nelem + 1 ) + j * stride2] +
                tse * a[2 * ( nelem + 1 ) + j * stride2 + nelem * stride1] -
                ( 1.0 - tse ) * ( 1.0 - ada ) * a[2 * ( nelem + 1 ) + 0] -
                ( 1.0 - tse ) * ada * a[2 * ( nelem + 1 ) + nelem * stride1] -
                tse * ( 1.0 - ada ) * a[2 * ( nelem + 1 ) + nelem * stride2] -
                tse * ada * a[2 * ( nelem + 1 ) + nelem * ( stride1 + stride2 )];
        }
    }
}

void build_coords( const int nelem, sidl::array< double >& coords )
{
    double ttime0, ttime1, utime1, stime1;
    print_time( false, ttime0, utime1, stime1 );

    // allocate the memory
    int numv     = nelem + 1;
    int numv_sq  = numv * numv;
    int tot_numv = numv * numv * numv;
    CHECK_SIZE( coords, 3 * tot_numv );
    double* coords_ptr = ARRAY_PTR( coords, double );

// use FORTRAN-like indexing
#define VINDEX( i, j, k ) ( ( i ) + ( (j)*numv ) + ( (k)*numv_sq ) )
    int idx;
    double scale1, scale2, scale3;
    // use these to prevent optimization on 1-scale, etc (real map wouldn't have
    // all these equal)
    scale1 = LENGTH / nelem;
    scale2 = LENGTH / nelem;
    scale3 = LENGTH / nelem;

#ifdef REALTFI
    // use a real TFI xform to compute coordinates
    // initialize positions of 8 corners
    coords_ptr[VINDEX( 0, 0, 0 )]                         = coords_ptr[VINDEX( 0, 0, 0 ) + nelem + 1] =
        coords_ptr[VINDEX( 0, 0, 0 ) + 2 * ( nelem + 1 )] = 0.0;
    coords_ptr[VINDEX( 0, nelem, 0 )] = coords_ptr[VINDEX( 0, nelem, 0 ) + 2 * ( nelem + 1 )] = 0.0;
    coords_ptr[VINDEX( 0, nelem, 0 ) + nelem + 1]                                             = LENGTH;
    coords_ptr[VINDEX( 0, 0, nelem )] = coords_ptr[VINDEX( 0, 0, nelem ) + nelem + 1] = 0.0;
    coords_ptr[VINDEX( 0, 0, nelem ) + 2 * ( nelem + 1 )]                             = LENGTH;
    coords_ptr[VINDEX( 0, nelem, nelem )]                                             = 0.0;
    coords_ptr[VINDEX( 0, nelem, nelem ) + nelem + 1] = coords_ptr[VINDEX( 0, nelem, nelem ) + 2 * ( nelem + 1 )] =
        LENGTH;
    coords_ptr[VINDEX( nelem, 0, 0 )]             = LENGTH;
    coords_ptr[VINDEX( nelem, 0, 0 ) + nelem + 1] = coords_ptr[VINDEX( nelem, 0, 0 ) + 2 * ( nelem + 1 )] = 0.0;
    coords_ptr[VINDEX( nelem, 0, nelem )] = coords_ptr[VINDEX( nelem, 0, nelem ) + 2 * ( nelem + 1 )] = LENGTH;
    coords_ptr[VINDEX( nelem, 0, nelem ) + nelem + 1]                                                 = 0.0;
    coords_ptr[VINDEX( nelem, nelem, 0 )] = coords_ptr[VINDEX( nelem, nelem, 0 ) + nelem + 1] = LENGTH;
    coords_ptr[VINDEX( nelem, nelem, 0 ) + 2 * ( nelem + 1 )]                                 = 0.0;
    coords_ptr[VINDEX( nelem, nelem, nelem )] = coords_ptr[VINDEX( nelem, nelem, nelem ) + nelem + 1] =
        coords_ptr[VINDEX( nelem, nelem, nelem ) + 2 * ( nelem + 1 )] = LENGTH;

    // compute edges
    // i (stride=1)
    compute_edge( &coords_ptr[VINDEX( 0, 0, 0 )], nelem, scale1, 1 );
    compute_edge( &coords_ptr[VINDEX( 0, nelem, 0 )], nelem, scale1, 1 );
    compute_edge( &coords_ptr[VINDEX( 0, 0, nelem )], nelem, scale1, 1 );
    compute_edge( &coords_ptr[VINDEX( 0, nelem, nelem )], nelem, scale1, 1 );
    // j (stride=numv)
    compute_edge( &coords_ptr[VINDEX( 0, 0, 0 )], nelem, scale1, numv );
    compute_edge( &coords_ptr[VINDEX( nelem, 0, 0 )], nelem, scale1, numv );
    compute_edge( &coords_ptr[VINDEX( 0, 0, nelem )], nelem, scale1, numv );
    compute_edge( &coords_ptr[VINDEX( nelem, 0, nelem )], nelem, scale1, numv );
    // k (stride=numv^2)
    compute_edge( &coords_ptr[VINDEX( 0, 0, 0 )], nelem, scale1, numv_sq );
    compute_edge( &coords_ptr[VINDEX( nelem, 0, 0 )], nelem, scale1, numv_sq );
    compute_edge( &coords_ptr[VINDEX( 0, nelem, 0 )], nelem, scale1, numv_sq );
    compute_edge( &coords_ptr[VINDEX( nelem, nelem, 0 )], nelem, scale1, numv_sq );

    // compute faces
    // i=0, nelem
    compute_face( &coords_ptr[VINDEX( 0, 0, 0 )], nelem, scale1, numv, numv_sq );
    compute_face( &coords_ptr[VINDEX( nelem, 0, 0 )], nelem, scale1, numv, numv_sq );
    // j=0, nelem
    compute_face( &coords_ptr[VINDEX( 0, 0, 0 )], nelem, scale1, 1, numv_sq );
    compute_face( &coords_ptr[VINDEX( 0, nelem, 0 )], nelem, scale1, 1, numv_sq );
    // k=0, nelem
    compute_face( &coords_ptr[VINDEX( 0, 0, 0 )], nelem, scale1, 1, numv );
    compute_face( &coords_ptr[VINDEX( 0, 0, nelem )], nelem, scale1, 1, numv );

    // initialize corner indices
    int i000 = VINDEX( 0, 0, 0 );
    int ia00 = VINDEX( nelem, 0, 0 );
    int i0t0 = VINDEX( 0, nelem, 0 );
    int iat0 = VINDEX( nelem, nelem, 0 );
    int i00g = VINDEX( 0, 0, nelem );
    int ia0g = VINDEX( nelem, 0, nelem );
    int i0tg = VINDEX( 0, nelem, nelem );
    int iatg = VINDEX( nelem, nelem, nelem );
    double cX, cY, cZ;
    int adaInts   = nelem;
    int tseInts   = nelem;
    int gammaInts = nelem;

    for( int i = 1; i < nelem; i++ )
    {
        for( int j = 1; j < nelem; j++ )
        {
            for( int k = 1; k < nelem; k++ )
            {
                // idx = VINDEX(i,j,k);
                double tse   = i * scale1;
                double ada   = j * scale2;
                double gamma = k * scale3;
                double tm1   = 1.0 - tse;
                double am1   = 1.0 - ada;
                double gm1   = 1.0 - gamma;

                cX = gm1 * ( am1 * ( tm1 * coords_ptr[i000] + tse * coords_ptr[i0t0] ) +
                             ada * ( tm1 * coords_ptr[ia00] + tse * coords_ptr[iat0] ) ) +
                     gamma * ( am1 * ( tm1 * coords_ptr[i00g] + tse * coords_ptr[i0tg] ) +
                               ada * ( tm1 * coords_ptr[ia0g] + tse * coords_ptr[iatg] ) );

                cY = gm1 * ( am1 * ( tm1 * coords_ptr[i000] + tse * coords_ptr[i0t0] ) +
                             ada * ( tm1 * coords_ptr[ia00] + tse * coords_ptr[iat0] ) ) +
                     gamma * ( am1 * ( tm1 * coords_ptr[i00g] + tse * coords_ptr[i0tg] ) +
                               ada * ( tm1 * coords_ptr[ia0g] + tse * coords_ptr[iatg] ) );

                cZ = gm1 * ( am1 * ( tm1 * coords_ptr[i000] + tse * coords_ptr[i0t0] ) +
                             ada * ( tm1 * coords_ptr[ia00] + tse * coords_ptr[iat0] ) ) +
                     gamma * ( am1 * ( tm1 * coords_ptr[i00g] + tse * coords_ptr[i0tg] ) +
                               ada * ( tm1 * coords_ptr[ia0g] + tse * coords_ptr[iatg] ) );

                double* ai0k = &coords_ptr[VINDEX( k, 0, i )];
                double* aiak = &coords_ptr[VINDEX( k, adaInts, i )];
                double* a0jk = &coords_ptr[VINDEX( k, j, 0 )];
                double* atjk = &coords_ptr[VINDEX( k, j, tseInts )];
                double* aij0 = &coords_ptr[VINDEX( 0, j, i )];
                double* aijg = &coords_ptr[VINDEX( gammaInts, j, i )];

                coords_ptr[VINDEX( i, j, k )] = ( am1 * ai0k[0] + ada * aiak[0] + tm1 * a0jk[0] + tse * atjk[0] +
                                                  gm1 * aij0[0] + gamma * aijg[0] ) /
                                                    2.0 -
                                                cX / 2.0;

                coords_ptr[nelem + 1 + VINDEX( i, j, k )] =
                    ( am1 * ai0k[nelem + 1] + ada * aiak[nelem + 1] + tm1 * a0jk[nelem + 1] + tse * atjk[nelem + 1] +
                      gm1 * aij0[nelem + 1] + gamma * aijg[nelem + 1] ) /
                        2.0 -
                    cY / 2.0;

                coords_ptr[2 * ( nelem + 1 ) + VINDEX( i, j, k )] =
                    ( am1 * ai0k[2 * ( nelem + 1 )] + ada * aiak[2 * ( nelem + 1 )] + tm1 * a0jk[2 * ( nelem + 1 )] +
                      tse * atjk[2 * ( nelem + 1 )] + gm1 * aij0[2 * ( nelem + 1 )] +
                      gamma * aijg[2 * ( nelem + 1 )] ) /
                        2.0 -
                    cZ / 2.0;
            }
        }
    }

#else
    for( int i = 0; i < numv; i++ )
    {
        for( int j = 0; j < numv; j++ )
        {
            for( int k = 0; k < numv; k++ )
            {
                idx = VINDEX( i, j, k );
                // blocked coordinate ordering
                coords_ptr[idx]                = i * scale1;
                coords_ptr[tot_numv + idx]     = j * scale2;
                coords_ptr[2 * tot_numv + idx] = k * scale3;
            }
        }
    }
#endif
    print_time( false, ttime1, utime1, stime1 );
    std::cout << "TSTT/MOAB: TFI time = " << ttime1 - ttime0 << " sec" << std::endl;
}

void build_connect( const int nelem, const int vstart, int*& connect )
{
    // allocate the memory
    int nume_tot = nelem * nelem * nelem;
    connect      = new int[8 * nume_tot];

    int vijk;
    int numv    = nelem + 1;
    int numv_sq = numv * numv;
    int idx     = 0;
    for( int i = 0; i < nelem; i++ )
    {
        for( int j = 0; j < nelem; j++ )
        {
            for( int k = 0; k < nelem; k++ )
            {
                vijk           = vstart + VINDEX( i, j, k );
                connect[idx++] = vijk;
                connect[idx++] = vijk + 1;
                connect[idx++] = vijk + 1 + numv;
                connect[idx++] = vijk + numv;
                connect[idx++] = vijk + numv * numv;
                connect[idx++] = vijk + 1 + numv * numv;
                connect[idx++] = vijk + 1 + numv + numv * numv;
                connect[idx++] = vijk + numv + numv * numv;
                assert( i <= numv * numv * numv );
            }
        }
    }
}
